library(DBI)
library(broom)
library(splines)
library(stringr)
library(dplyr)
library(ggplot2)
nhanes.sqlite can download here
or generate by run notebooksource("../6_nhanes_data/phesant.R")
exposure_vars <-
read.delim("../data/select_vars.txt", header = FALSE)$V1
exposure_cols <- paste(exposure_vars, collapse = ", ")
# Load data and convert data types according to PHESEANT.
# Return data set and PHESEANT results.
load_data <- function(exposure_cols) {
nhanes_db <- dbConnect(RSQLite::SQLite(), "../nhanes.sqlite")
cols <-
'BMXWAIST, RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR,SDMVPSU,SDMVSTRA, years,'
data_sql <-
paste('SELECT', cols, exposure_cols, 'FROM merged_table')
data <- dbGetQuery(nhanes_db, data_sql)
data <- na.omit(data)
dbDisconnect(nhanes_db)
phs_res <- phesant(data)
data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])] <-
lapply(data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])], as.factor)
data[, names(phs_res[phs_res == "continuous"])] <-
lapply(data[, names(phs_res[phs_res == "continuous"])], as.numeric)
return(list(data = data, phs_res = phs_res))
}
data_phs <- load_data(exposure_cols)
data <- data_phs$data
phs_res <- data_phs$phs_res
invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}
min_max_norm <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
build_formula <- function(base_model,exposure_var,inv=FALSE) {
var <- "+ %s"
if(inv) var <- " + invNorm(%s)"
as.formula(
sprintf(paste0(base_model, var),
exposure_var
)
)
}
enwas <-
function(base_model,
exposure_vars,
data_set,
inv_norm = FALSE) {
num_var <- length(exposure_vars)
model_list <- vector(mode = "list", length = num_var)
num_cols <- sapply(data_set[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])
if (inv_norm) {
data_set[, num_cols] <- lapply(data_set[, num_cols], invNorm)
}
association_list <- data.frame()
factor_terms <- c() # to hold the factors terms
num_var <- length(exposure_vars)
for (i in 1:num_var) {
exposure <- exposure_vars[i]
if (is.factor(data_set[, exposure])) {
factor_terms <-
c(factor_terms, paste0(exposure, levels(data_set[, exposure])))
}
model <- build_formula(base_model, exposure)
mod <- lm(model, data_set)
model_list[[i]] <- mod
mod_df <- tidy(mod)
association_list <-
rbind(association_list, mod_df) # step 3b of algorithm
}
xwas_list <-
association_list[association_list$term %in% c(exposure_vars, factor_terms), ]
# sd_x_list <- sapply(data_set[,num_cols],sd)
sd_x_list <- sapply(data_set, function(x)
sd(as.numeric(x)))
for (var in exposure_vars) {
xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] <-
xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] * sd_x_list[var]
}
xwas_list$fdr <-
p.adjust(xwas_list$p.value, method = 'BY')
xwas_list <- xwas_list |>
mutate(lower = estimate - 1.96 * std.error) |>
mutate(upper = estimate + 1.96 * std.error)
return (list(model_list = model_list, enwas_res = xwas_list))
}
# forest_plot <- function(xwas_result) {
# xwas_result |>
# ggplot(aes(
# x = term,
# y = estimate,
# ymin = lower,
# ymax = upper,
# colour = estimate
# )) +
# geom_pointrange() +
# coord_flip() + # flip coordinates (puts labels on y axis)
# xlab("Exposures") + ylab("Estimate (95% CI)") +
# theme_bw() # use a white background
# }
forest_plot <- function(xwas_result) {
xwas_result |>
ggplot(aes(
x = term,
y = estimate,
colour = estimate
)) +
geom_point(size = 2, position = position_dodge(width = 1)) +
geom_errorbar(aes(ymin = lower, ymax = upper),
position = position_dodge(width = 1), width=0.5,cex=1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
coord_flip() + # flip coordinates (puts labels on y axis)
xlab("Exposures") + ylab("Estimate (95% CI)") +
theme_bw() # use a white background
}
forest_plot_mult <- function(xwas_result_list) {
xwas_result <- do.call("rbind", xwas_result_list)
xwas_result$EnWAS <-
rep(names(xwas_result_list), each = nrow(xwas_result_list[[1]]))
xwas_result |>
ggplot(aes(
x = term,
y = estimate,
colour = EnWAS
)) +
geom_point(size = 2, position = position_dodge(width = 1)) +
geom_errorbar(aes(ymin = lower, ymax = upper),
position = position_dodge(width = 1), width=0.5,cex=1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
coord_flip() + # flip coordinates (puts labels on y axis)
xlab("Exposures") + ylab("Estimate (95% CI)") +
theme_bw() # use a white background
}
linear_model <- 'BMXWAIST ~ RIDAGEYR*RIAGENDR + BMXBMI'
linear_res2 <- enwas(linear_model, exposure_vars, data)
all_ns_model <-
'BMXWAIST ~ ns(RIDAGEYR, knots = seq(30, 80, by = 10)) * RIAGENDR + ns(BMXBMI,knots = seq(30, 80, by = 10))'
all_ns_res <- enwas(all_ns_model, exposure_vars, data)
lm_inverse_res <- enwas(linear_model, exposure_vars, data,inv = TRUE)
ns_inverse_res <- enwas(all_ns_model, exposure_vars, data,inv = TRUE)
forest_plot(linear_res2$enwas_res)
forest_plot_mult(
list(
linear = linear_res2$enwas_res,
ns = all_ns_res$enwas_res
)
)
forest_plot_mult(
list(
lm_inv = lm_inverse_res$enwas_res,
ns_inv = ns_inverse_res$enwas_res
)
)
forest_plot_mult(
list(
linear = linear_res2$enwas_res,
lm_inv = lm_inverse_res$enwas_res
)
)
forest_plot_mult(
list(
ns = all_ns_res$enwas_res,
ns_inv = ns_inverse_res$enwas_res
)
)
num_cols <- sapply(data[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])
linear <- linear_res2$enwas_res
linear <- linear[linear$term %in% num_cols,]
ns <- all_ns_res$enwas_res
ns <- ns[ns$term %in% num_cols,]
lm_inv <- lm_inverse_res$enwas_res
lm_inv <- lm_inv[lm_inv$term %in% num_cols,]
ns_inv <- ns_inverse_res$enwas_res
ns_inv <- ns_inv[ns_inv$term %in% num_cols,]
forest_plot_mult(
list(
ns = ns,
linear = linear
)
)
forest_plot_mult(
list(
lm_inv = lm_inv,
ns_inv = ns_inv
)
)
forest_plot_mult(
list(
linear = linear,
lm_inv = lm_inv
)
)
forest_plot_mult(
list(
ns = ns,
ns_inv = ns_inv
)
)
xwas_res_list <- list(linear = linear,ns = ns,lm_inv = lm_inv,ns_inv = ns_inv)
xwas_result <- do.call("rbind",xwas_res_list )
xwas_result$EnWAS <-
rep(names(xwas_res_list), each = nrow(xwas_res_list[[1]]))
xwas_result$postive <- xwas_result$lower * xwas_result$upper
xwas_result$postive <- xwas_result$postive >=0
# show the false positive raised in linear base model
ns_vs_linear <- xwas_result[xwas_result$EnWAS=="linear",]$postive != xwas_result[xwas_result$EnWAS=="ns",]$postive
num_cols[ns_vs_linear]
[1] “DR1EXMER” “DR1TM201”
for(c in num_cols[ns_vs_linear]){
g <- ggplot(data, aes_string(x="RIDAGEYR", y=paste0("invNorm(",c,")"), colour="RIAGENDR")) +
geom_point(alpha=0.1)+
geom_smooth(method = lm, formula = y ~ ns(x, knots = seq(30, 80, by = 10)), size = 1)
print(g)
}
# ns vs inverse norm transformation
ns_vs_ns_inv <- xwas_result[xwas_result$EnWAS=="ns_inv",]$postive != xwas_result[xwas_result$EnWAS=="ns",]$postive
num_cols[ns_vs_ns_inv]
[1] “DR1EXMER” “DR1TPROT” “DR1TCRYP” “DR1TNIAC” “DR1TVK” “DR1TSELE”
for(c in num_cols[ns_vs_ns_inv]){
g <- ggplot(data, aes_string(x="RIDAGEYR", y=paste0("invNorm(",c,")"), colour="RIAGENDR")) +
geom_point(alpha=0.1)+
geom_smooth(method = lm, formula = y ~ ns(x, knots = seq(30, 80, by = 10)), size = 1)
print(g)
par(mfrow=c(1,2))
hist(data[,c],main = c)
hist(invNorm(data[,c]), main = paste0("invNorm:",c))
}